#############    CHARACTER ACTIONS     #################
#Code accompanying the article: 
#Andrew Piper, "What do characters do? The embodied agency of fictional characters," JCLS 2023
#all texts processed through bookNLP large model: https://github.com/booknlp/booknlp

##########   CHARACTER ACTION EXTRACTION   #############
#Extracting character actions and associated supersenses
#input = the bookNLP .book file and .supersense file
#output = a table of verb and supersense types by book
library(stringr)
library(rjson)

#get root directory
#NOTE: wd.root must end in a path separator, because each book's path is built
#by pasting the folder name directly onto it
wd.root<-c("")
setwd(wd.root)

#get list of folders (i.e. books)
filenames<-list.files()

#one table per book; they are bound together once after the loop
book.tables<-vector("list", length(filenames))

#for every book
for (i in seq_along(filenames)){
  
  print(i)
  
  #path of the i-th book folder; its files are read by full path, so the
  #working directory stays at wd.root and the output below is written there
  #rather than into whichever book folder happened to be processed last
  wd.file<-paste(wd.root, filenames[i], sep="")
  
  #list bookNLPfiles
  book.files<-list.files(wd.file)
  book.name<-book.files[which(str_sub(book.files, start= -5) == ".book")]
  super<-book.files[grep(".supersense", book.files, fixed=TRUE)]
  
  #skip folders that do not hold exactly one .book and one .supersense file
  if (length(book.name) != 1 || length(super) != 1){
    warning("skipping ", filenames[i],
            ": expected exactly one .book and one .supersense file", call.=FALSE)
    next
  }
  
  #ingest .book file (json format)
  a<-fromJSON(file = file.path(wd.file, book.name))
  
  #token numbers of every action in which a character is the agent.
  #The first element of each character record is read as the agent role
  #(positional, as in the original code -- presumably the "agent" key; verify
  #against the bookNLP output). Missing token numbers are dropped by unlist().
  agent.i.v<-unlist(lapply(a$characters, function(character){
    lapply(character[[1]], function(action) action$i)
  }))
  
  #find supersense and store in table
  super.df<-read.csv(file.path(wd.file, super), quote="", sep="\t")
  
  #keep supersense spans that start or end on an agent token
  is.agent<-super.df$start_token %in% agent.i.v | super.df$end_token %in% agent.i.v
  super.df<-super.df[is.agent,]
  #this is the final list of verb types linking characters
  
  #store with metadata; fileID is repeated explicitly so that a book with no
  #matching rows yields an empty table instead of a data.frame() error
  book.tables[[i]]<-data.frame(fileID=rep(filenames[i], nrow(super.df)),
                               interaction.type=super.df$supersense_category,
                               interaction.token=super.df$text)
}

#bind all books (skipped books left NULL entries, which are removed first)
final.df<-do.call(rbind, Filter(Negate(is.null), book.tables))

#remove rows with nouns
final.df2<-final.df[grep("verb", final.df$interaction.type),]

#setwd("")
write.csv(final.df2, file="CharacterActionsAll.csv", row.names = FALSE)

#count supersense types within each book: one frequency table per fileID,
#each holding only the types that occur in that book
counts.by.book<-tapply(final.df2$interaction.type, final.df2$fileID, table)

#stack the per-book tables into one long dataframe
#(columns: file.name, Var1 = supersense type, Freq = count)
n.books<-nlevels(factor(final.df2$fileID))
book.rows<-vector("list", n.books)
for (book.idx in 1:n.books){
  print(book.idx)
  file.name<-names(counts.by.book[book.idx])
  verb.types<-counts.by.book[[book.idx]]
  book.rows[[book.idx]]<-data.frame(file.name, verb.types)
}
type.df<-do.call(rbind, book.rows)

#save
#setwd("")
write.csv(type.df, file="CharacterActionsTypes.df.csv", row.names = F)

###### DISTINCTIVE TYPE ANALYSIS ########
#compares verb types by category (fiction / non-fiction) 

#load metadata
setwd("")
meta<-read.csv("CONLIT_META.csv")

#load type data
setwd("")
type<-read.csv("CharacterActionsTypes.df.csv")

#check matching (prints positions of books in the type table without metadata)
which(!levels(factor(type$file.name)) %in% meta$ID)

#compare by category

#FIC / NON
fic<-type[type$file.name %in% meta$ID[meta$Category == "FIC"], ]
non<-type[type$file.name %in% meta$ID[meta$Category == "NON"], ]

#tabulate both categories over the SAME set of supersense types, so the two
#tables always have identical rows in identical order; a type that is absent
#from one category gets a frequency of 0 instead of a missing row
all.types<-sort(unique(c(as.character(fic$Var1), as.character(non$Var1))))
sum.by.type<-function(d){
  totals<-tapply(d$Freq, factor(as.character(d$Var1), levels=all.types), sum)
  totals[is.na(totals)]<-0
  data.frame(frequency=as.vector(totals), row.names=all.types)
}
fic.df<-sum.by.type(fic)
non.df<-sum.by.type(non)

#sanity check: the types are stored as row names (there is no Var1 column in
#these tables), so compare the row names; should print integer(0)
which(row.names(fic.df) != row.names(non.df))

#entropy-style helper used for Dunning's log-likelihood:
#returns sum(p * log(p)) for the proportions p = k / sum(k)
#(i.e. the negative Shannon entropy, in nats).
#Adding (k == 0) inside the log turns log(0) into log(1) = 0, so empty cells
#contribute nothing. k may be a numeric vector or a table of counts.
H<-function(k){
  p<-k/sum(k)
  sum(p*log(p+(k==0)))
}

#total number of character actions in each category
all1<-sum(fic.df$frequency)
all2<-sum(non.df$frequency)

#carry the supersense type (stored as row name) as a column and sort both
#tables by it so that row j refers to the same type in each
fic.df$types<-row.names(fic.df)
non.df$types<-row.names(non.df)
fic.df<-fic.df[order(fic.df$types),]
non.df<-non.df[order(non.df$types),]
#prints positions where the two tables disagree
which(fic.df$types != non.df$types)

#per-type counts
word1<-fic.df$frequency
word2<-non.df$frequency

#results table, statistics filled in below
results <- data.frame(word = fic.df$types,
                      fic=word1,
                      non=word2,
                      G2 = 0,
                      fisher.OR = 0,
                      fisher.p = 0)

#one 2x2 test per type: (this type, all other types) x (FIC, NON)
for (j in 1:nrow(results)){
  cont.table<-data.frame(c(word1[j], all1-word1[j]), c(word2[j], all2-word2[j]))
  #Fisher's exact test gives the odds ratio and p-value
  fish<-fisher.test(cont.table)
  #Dunning's log-likelihood ratio (G2) from the entropy terms
  h.cells<-H(cont.table)
  h.rows<-H(rowSums(cont.table))
  h.cols<-H(colSums(cont.table))
  results$G2[j]<-2*sum(cont.table)*(h.cells-h.rows-h.cols)
  results$fisher.OR[j]<-fish$estimate
  results$fisher.p[j]<-fish$p.value
}

#signed G2: negative when the odds ratio is below 1, positive otherwise
results$G2_Sorted<-vector(mode="numeric", length=nrow(results))
for (i in 1:nrow(results)){
  direction<-if (results$fisher.OR[i] < 1) -1 else 1
  results$G2_Sorted[i]<-direction*results$G2[i]
}
#largest signed score first
results<-results[order(-results$G2_Sorted),]

setwd("")
write.csv(results, file="CharacterActions_Results_CONLIT.csv", row.names = F)

#############    EFFECT SIZE ANALYSIS   ##############
library(effsize)

#load metadata
setwd("")
meta<-read.csv("CONLIT_META.csv")

#load type data
setwd("")
type<-read.csv("CharacterActionsTypes.df.csv")

#check matching (prints positions of books in the type table without metadata)
which(!levels(factor(type$file.name)) %in% meta$ID)

######## Analysis 1: Effect Size on "Embodiment" and "Cognition" Classes #########

#Share of the given supersense types among ALL character actions, per book.
#  type.df: long table with columns file.name, Var1 (type) and Freq (count)
#  verbs:   character vector of supersense types to sum
#Returns a numeric vector named by file.name, one value per book in type.df.
#
#We divide by the total # of character actions (not total tokens), i.e. what
#share are these actions of all character actions, not of the novel. The reason
#is that genres may have different levels of characterization, so lower levels
#of an action may be a sign of lower levels of characterization, not the action.
#
#The type table only has rows for types that occur in a book, so numerator and
#denominator are matched by book name; a book with none of the requested types
#gets a share of 0 instead of shifting every later book out of position.
verb.share<-function(type.df, verbs){
  action.count<-tapply(type.df$Freq, type.df$file.name, sum)
  sub<-type.df[type.df$Var1 %in% verbs,]
  sub.sum<-tapply(sub$Freq, sub$file.name, sum)
  share<-setNames(numeric(length(action.count)), names(action.count))
  share[names(sub.sum)]<-as.vector(sub.sum)/as.vector(action.count[names(sub.sum)])
  share
}

#establish verb subset (the 3 embodiment types)
verb<-c("verb.motion", "verb.contact", "verb.body")
share<-verb.share(type, verb)
#check matching (prints positions of metadata rows with no action data)
which(!meta$ID %in% names(share))
#attach to metadata by ID rather than by position (NA where a book has no data)
meta$embodiment<-unname(share[match(as.character(meta$ID), names(share))])

#redo for cognition
#establish verb subset
verb<-c("verb.cognition", "verb.emotion")
share<-verb.share(type, verb)
which(!meta$ID %in% names(share))
meta$cognition<-unname(share[match(as.character(meta$ID), names(share))])

#effect size
hist(meta$embodiment)
cohen.d(meta$embodiment[meta$Category == "FIC"], meta$embodiment[meta$Category == "NON"])
hist(meta$cognition)
cohen.d(meta$cognition[meta$Category == "FIC"], meta$cognition[meta$Category == "NON"])

#plot by category (FIC / NON)
library(reshape2)

#keep the grouping columns and the two share columns, then go to long form
#(one row per book and measure)
plot.cols<-c("Category", "embodiment", "cognition", "Genre")
df1<-meta[,colnames(meta) %in% plot.cols]
df<-melt(df1, id.vars=c("Category", "Genre"))
df$value<-as.vector(df$value)

#x-axis class = category:measure. The four levels are relabelled with short
#codes in their existing level order and then put in display order.
#NOTE(review): the relabelling assumes exactly the four FIC/NON x
#cognition/embodiment combinations are present -- confirm against the metadata
df$Class<-factor(paste(df$Category, df$variable, sep=":"))
levels(df$Class)<-c("FIC:c", "FIC:e", "NON:c", "NON:e")
df$Class<-ordered(df$Class, levels = c("FIC:e", "NON:e", "FIC:c", "NON:c"))

#### FIGURE ####
library(ggplot2)
ggplot(df, aes(x=Class, y=value, fill=variable)) +
  geom_boxplot() +
  scale_fill_manual(values=c("#56B4E9", "#E69F00")) +
  theme_classic() +
  theme(legend.title = element_blank()) +
  labs(x="Category",
       y="Fraction of all verb types",
       caption="Source: CONLIT")

#plot by genre (Embodiment Only)
library(forcats)
#keep the embodiment rows only
df2<-df[df$variable == "embodiment",]
#second category column in which the YA and MID genres form a "YOUTH" group
df2$Category2<-df2$Category
df2$Category2[df2$Genre %in% c("YA", "MID")]<-"YOUTH"
#order genres by their embodiment values
df2$Genre<-fct_reorder(df2$Genre, df2$value)

#boxplot
ggplot(df2, aes(x=Genre, y=value, fill=Category2)) +
  geom_boxplot() +
  theme_classic() +
  theme(legend.title = element_blank()) +
  labs(x="Category",
       y="Fraction of all verb types",
       caption="Source: CONLIT")

#######  Analysis 2: Individual Scores  #######
#This analysis isn't shown in the paper**
#This focuses on the top 4 embodied actions across fictional genres to observe between genre differences
#Requires `meta` and `type` as loaded at the start of the effect size analysis.

#just for fiction
fic<-meta[meta$Category == "FIC",]

#the four verb types, named by the column each one fills in `fic`.
#All four columns are computed here, so every model below can run in one pass
#(previously only one column existed at a time and the others had to be
#produced by editing and re-running the code by hand).
verbs<-c(contact="verb.contact",
         body="verb.body",
         perception="verb.perception",
         motion="verb.motion")

#total character actions per book (the denominator)
action.count<-tapply(type$Freq, type$file.name, sum)

#get frequency: share of each verb type among all character actions per book
for (col in names(verbs)){
  sub<-type[type$Var1 %in% verbs[[col]],]
  sub.sum<-tapply(sub$Freq, sub$file.name, sum)
  #books without this verb type have no row in `sub`; they get a share of 0
  share<-setNames(numeric(length(action.count)), names(action.count))
  share[names(sub.sum)]<-as.vector(sub.sum)/as.vector(action.count[names(sub.sum)])
  #add to respective column, matched by ID rather than by position
  fic[[col]]<-unname(share[match(as.character(fic$ID), names(share))])
}
#check matching (prints positions of fiction books with no action data)
which(!fic$ID %in% names(action.count))

#remove Romance
fic2<-fic[fic$Genre != "ROM",]

#analysis of variance
#contact
summary(aov(contact ~ Genre, data=fic2))
model<-aov(contact ~ Genre, data=fic2)
plot(TukeyHSD(model, conf.level=.95), las = 2)
#body
summary(aov(body ~ Genre, data=fic2))
model<-aov(body ~ Genre, data=fic2)
plot(TukeyHSD(model, conf.level=.95), las = 2)
cohen.d(fic2$body[fic2$Genre == "MID"], fic2$body[fic2$Genre == "NYT"])
#perception
summary(aov(perception ~ Genre, data=fic2))
model<-aov(perception ~ Genre, data=fic2)
plot(TukeyHSD(model, conf.level=.95), las = 2)
cohen.d(fic2$perception[fic2$Genre == "YA"], fic2$perception[fic2$Genre == "NYT"])
cohen.d(fic2$perception[fic2$Genre == "YA"], fic2$perception[fic2$Genre == "SF"])
cohen.d(fic2$perception[fic2$Genre == "BS"], fic2$perception[fic2$Genre == "SF"])
#motion
summary(aov(motion ~ Genre, data=fic2))
model<-aov(motion ~ Genre, data=fic2)
plot(TukeyHSD(model, conf.level=.95), las = 2)
boxplot(motion ~ Genre, data=fic2)

##############    HATHI1M DATA   #################
#Yearly supersense frequencies by genre; uses melt() and ggplot() from the
#reshape2 / ggplot2 packages loaded earlier in the script
setwd("")
hathi<-read.csv("Hathi1M_YearlyData.csv")

#combine cognition verbs and embodiment verbs
embodiment<-hathi$verb.body+hathi$verb.contact+hathi$verb.motion
cognition<-hathi$verb.cognition+hathi$verb.emotion
#metadata to keep
keep1<-c("year", "genre")
#combine
h<-cbind(hathi[,colnames(hathi) %in% keep1], cognition, embodiment)
#turn into long form (one row per year, genre and measure)
h2<-melt(h, id.vars=c("year", "genre"))

#then take yearly difference for each variable between FIC and NON
# h.fic<-h[h$genre == "FIC",]
# h.non<-h[h$genre == "NON",]
# body.diff<-h.fic$embod-h.non$embod
# cog.diff<-h.fic$cog-h.non$cog
# cog.df<-data.frame(h.fic$year, cog.diff)
# cog.df$variable<-c("Cognition")
# body.df<-data.frame(h.fic$year, body.diff)
# body.df$variable<-c("Embodiment")
# colnames(cog.df)<-c("year", "value", "variable")
# colnames(body.df)<-c("year", "value", "variable")
# df<-rbind(cog.df, body.df)

#plot both measures over time: colour = measure, line type = genre
ggplot(h2, aes(x=year, y=value, color = variable, linetype = genre)) +
  geom_line() +
  scale_colour_manual(values=c("#E69F00", "#56B4E9")) +
  theme_classic() +
  theme(legend.title = element_blank()) +
  theme(legend.text = element_text(size=10)) +
  labs(x="Year", y="Frequency", caption="Source: HATHI1M")

#Plot body actions
#fiction rows only, restricted to the three embodiment types
#(h2 is reused here: from this point on it is this wide fiction table)
h2<-hathi[hathi$genre == "FIC",]
keep<-c("year", "genre", "verb.motion", "verb.contact", "verb.body")
h2<-h2[,colnames(h2) %in% keep]
h3<-melt(h2, id.vars=c("year", "genre"))

ggplot(h3, aes(x=year, y=value, color = variable)) +
  geom_line() +
  theme_classic() +
  theme(legend.title = element_blank()) +
  labs(x="Year", y="Frequency", caption="Source: HATHI1M")

#Calculate relative change between two windows:
#c19 holds the years 1802-1849, c20 holds the years 1950-1999
c19<-h2[which(h2$year > 1801 & h2$year < 1850),]
c20<-h2[which(h2$year > 1949 & h2$year < 2000),]
#absolute change in the mean
mean(c20$verb.motion)-mean(c19$verb.motion)
mean(c20$verb.contact)-mean(c19$verb.contact)
#ratio of the means
mean(c20$verb.motion)/mean(c19$verb.motion)
mean(c20$verb.contact)/mean(c19$verb.contact)
mean(c20$verb.body)/mean(c19$verb.body)


#############   TOKEN ANALYSIS    ##############

##### Analysis 1: what are the most frequent words by category? #####

#load metadata
setwd("")
meta<-read.csv("CONLIT_META.csv")

#load token data
setwd("")
a<-read.csv("CharacterActionsAll.csv")

#keep fiction only
a<-a[which(a$fileID %in% meta$ID[meta$Category == "FIC"]),]

#lemmatize
library(textstem)

#for each verb type, what are the most frequent tokens (after lemmatizing)
verb.types<-levels(factor(a$interaction.type))
top.rows<-vector("list", length(verb.types))
for (i in seq_along(verb.types)){
  b<-a[a$interaction.type == verb.types[i],]
  #token frequencies for this verb type (columns Var1, Freq)
  tok.df<-data.frame(sort(table(b$interaction.token), decreasing = TRUE))
  tok.df$lemmas<-lemmatize_strings(as.character(tok.df$Var1))
  #sum frequencies per lemma and keep at most the five largest; bounding the
  #index avoids the NA entries that [1:5] produces when there are fewer than
  #five lemmas (those were written out as the literal string "NA")
  lemma.freq<-sort(tapply(tok.df$Freq, tok.df$lemmas, sum), decreasing = TRUE)
  lemma.freq<-lemma.freq[seq_len(min(5, length(lemma.freq)))]
  tokens<-paste(names(lemma.freq), collapse=",")
  #columns are named explicitly so the global `type` table is not overwritten
  top.rows[[i]]<-data.frame(type=verb.types[i], tokens=tokens)
}
final.df<-do.call(rbind, top.rows)

setwd("")
write.csv(final.df, file="CharacterActions_VerbType_TopTokens.csv", row.names = FALSE)


##### Analysis 2: Focus on specific verb types #################

#load metadata
setwd("")
meta<-read.csv("CONLIT_META.csv")

#load token data
setwd("")
a<-read.csv("CharacterActionsAll.csv")

#subset by FIC
b<-a[a$fileID %in% meta$ID[meta$Category == "FIC"],]

#remove ROM
fic.sub<-b[!b$fileID %in% meta$ID[meta$Genre == "ROM"],]

#Find most frequent tokens for top verb types
top<-c("verb.contact", "verb.body", "verb.perception", "verb.motion", "verb.cognition", "verb.social", "verb.emotion", "verb.communication", "verb.possession", "verb.change", "verb.creation", "verb.stative")

#output file for each verb type, in the same order as `top`
token.files<-c("CharacterActions_Contact_Tokens.csv",
               "CharacterActions_Body_Tokens.csv",
               "CharacterActions_Perception_Tokens.csv",
               "CharacterActions_Motion_Tokens.csv",
               "CharacterActions_Cognition_Tokens.csv",
               "CharacterActions_Social_Tokens.csv",
               "CharacterActions_Emotion_Tokens.csv",
               "CharacterActions_Communication_Tokens.csv",
               "CharacterActions_Possession_Tokens.csv",
               "CharacterActions_Change_Tokens.csv",
               "CharacterActions_Creation_Tokens.csv",
               "CharacterActions_Stative_Tokens.csv")

#one pass per verb type (replaces twelve copies of the same five statements)
for (k in seq_along(top)){
  #tokens tagged with the k-th verb type
  sub<-fic.sub[fic.sub$interaction.type == top[k],]
  #token frequencies, most frequent first
  out<-sort(table(sub$interaction.token), decreasing = TRUE)
  #keep tokens occurring more than 5 times
  out<-out[out > 5]
  #show up to the 100 most frequent tokens (no NA padding for short lists);
  #print() is explicit because values are not auto-printed inside a loop
  print(out[seq_len(min(100, length(out)))])
  #written as a two-column table (Var1 = token, Freq = count)
  write.csv(out, file=token.files[k], row.names = FALSE)
}

########  Analysis 3: Most distinctive words ############
#Here we compare fictional actions to non-fiction
#We condition on the four embodied verb types

#load metadata
setwd("")
meta<-read.csv("CONLIT_META.csv")

#load token data
setwd("")
a<-read.csv("CharacterActionsAll.csv")

#condition on one of the 4 embodied actions (swap the active line to change it)
#b<-a[a$interaction.type == "verb.body",]
#b<-a[a$interaction.type == "verb.contact",]
#b<-a[a$interaction.type == "verb.motion",]
b<-a[a$interaction.type == "verb.perception",]

#book IDs by group
fic.ids<-meta$ID[meta$Category == "FIC"]
non.ids<-meta$ID[meta$Category == "NON"]
rom.ids<-meta$ID[meta$Genre == "ROM"]

#fiction tokens, excluding books of genre ROM; non-fiction tokens
fic<-b[b$fileID %in% fic.ids & !b$fileID %in% rom.ids,]
non<-b[b$fileID %in% non.ids,]

#frequency per word in each group (columns Var1, Freq)
fic.token<-data.frame(table(fic$interaction.token))
non.token<-data.frame(table(non$interaction.token))

#entropy-style helper used for Dunning's log-likelihood:
#returns sum(p * log(p)) for the proportions p = k / sum(k)
#(i.e. the negative Shannon entropy, in nats).
#Adding (k == 0) inside the log turns log(0) into log(1) = 0, so empty cells
#contribute nothing. k may be a numeric vector or a table of counts.
H<-function(k){
  p<-k/sum(k)
  sum(p*log(p+(k==0)))
}

#get total word counts for each genre
#(taken before truncation, so they cover every token of the verb type)
all1<-sum(fic.token$Freq)
all2<-sum(non.token$Freq)

#subset, order and match
fic.token$Var1<-as.character(fic.token$Var1)
non.token$Var1<-as.character(non.token$Var1)
fic.token<-fic.token[order(-fic.token$Freq),]
non.token<-non.token[order(-non.token$Freq),]
#keep at most the top 1000 of each; head() does not pad with NA rows when a
#group has fewer than 1000 distinct tokens
fic.token<-head(fic.token, 1000)
non.token<-head(non.token, 1000)
#merge (Freq.x = fiction count, Freq.y = non-fiction count)
all.token<-merge(fic.token, non.token, by="Var1", all=TRUE)
#a word missing from one list has an NA count; set it to 0
all.token[is.na(all.token)]<-0
#keep only words that appear in both top lists
all.token<-all.token[which(all.token$Freq.x != 0 & all.token$Freq.y != 0),]

#store empty results in a table
results <- data.frame(word = all.token$Var1, 
                      fic=all.token$Freq.x,
                      non=all.token$Freq.y,
                      G2 = 0,
                      fisher.OR = 0,
                      fisher.p = 0)

#create loop to go through every word
#(seq_len() makes this a no-op, not an error, when no word is shared)
for (j in seq_len(nrow(results))){
  #create contingency table for each word: (this word, all other words) x (FIC, NON)
  cont.table<-data.frame(c(results[j,2], all1-results[j,2]), c(results[j,3], all2-results[j,3]))
  #get straight odds ratio
  fish<-fisher.test(cont.table)
  #Dunning's log-likelihood ratio
  LLR<-2*sum(cont.table)*(H(cont.table)-H(rowSums(cont.table))-H(colSums(cont.table)))
  results$G2[j]<-LLR
  results$fisher.OR[j]<-fish$estimate
  results$fisher.p[j]<-fish$p.value
}
#signed G2: negative when the odds ratio is below 1, positive otherwise
results$G2_Sorted<-vector(mode="numeric", length=nrow(results))
for (i in seq_len(nrow(results))){
  if (results$fisher.OR[i] < 1){
    results$G2_Sorted[i]<--results$G2[i]
  } else {
    results$G2_Sorted[i]<-results$G2[i]
  }
}
#sort by signed G2 score, largest first
results<-results[order(-results$G2_Sorted),]

#setwd("")
write.csv(results, file="CharacterActions_MDW_Perception.csv", row.names = FALSE)


############   Lemmatize Words #########################
#Collapses the perception token counts onto lemmas and saves the most frequent
library(textstem)
setwd("")
a<-read.csv("CharacterActions_Perception_Tokens.csv")
a$lemmas<-lemmatize_strings(a$Var1)
#total frequency per lemma, most frequent first
b<-sort(tapply(a$Freq, a$lemmas, sum), decreasing = TRUE)
setwd("")
#write at most the top 100 lemmas; bounding the index avoids the NA rows that
#b[1:100] produces when there are fewer than 100 lemmas
write.csv(b[seq_len(min(100, length(b)))], file="CharacterActions_Perception_Lemmas.csv")


#############     MANUAL VALIDATION OF SUPER SENSE TYPES    ###############

library(stringr)
library(rjson)

######### SAMPLE SENTENCES ######### 
#outputs 1 sample sentence (plus the sentence before it) from up to 1000 books
#includes supersense type for human annotation
#NOTE: the sample is random and no seed is set, so each run draws different
#books and tokens

setwd("")
meta<-read.csv("CONLIT_META.csv")

#get root directory
#NOTE: wd.root must end in a path separator, because each book's path is built
#by pasting the folder name directly onto it
wd.root<-c("")
setwd(wd.root)

#get list of folders (i.e. books)
filenames<-list.files()

#subset by FIC
filenames<-filenames[filenames %in% meta$ID[meta$Category == "FIC"]]

#subsample 1000 books (or all of them when fewer than 1000 are available,
#instead of failing in sample())
filenames<-sample(filenames, min(1000, length(filenames)))

#one table per book; they are bound together once after the loop
sample.rows<-vector("list", length(filenames))

#for every book
for (i in seq_along(filenames)){
  
  print(i)
  
  #path of the i-th book folder; its files are read by full path so the
  #working directory stays at wd.root
  wd.file<-paste(wd.root, filenames[i], sep="")
  
  #list bookNLPfiles
  book.files<-list.files(wd.file)
  
  #ingest supersense table
  super<-book.files[grep(".supersense", book.files, fixed=TRUE)]
  super.df<-read.csv(file.path(wd.file, super), quote="", sep="\t")
  
  #ingest .book file (json format)
  book.name<-book.files[which(str_sub(book.files, start= -5) == ".book")]
  a<-fromJSON(file = file.path(wd.file, book.name))
  
  #skip books without any character
  if (length(a$characters) == 0){
    warning("skipping ", filenames[i], ": no characters", call.=FALSE)
    next
  }
  
  #agent role of the top (first-listed) character. The first element of the
  #character record is read as the agent role (positional, as in the original
  #code -- presumably the "agent" key; verify against the bookNLP output)
  agents<-a$characters[[1]][[1]]
  
  #actions whose token starts a supersense span
  has.sense<-vapply(agents,
                    function(action) isTRUE(action$i %in% super.df$start_token),
                    logical(1))
  candidates<-which(has.sense)
  
  #skip the book when none of the character's actions carries a supersense;
  #otherwise `type` below would be empty and data.frame() would fail
  if (length(candidates) == 0){
    warning("skipping ", filenames[i], ": no agent token with a supersense", call.=FALSE)
    next
  }
  
  #randomly pick one of those actions (sample.int on the count, so that a
  #single candidate is not misread by sample() as the range 1:n)
  k<-candidates[sample.int(length(candidates), 1)]
  #access token
  agent.w<-agents[[k]]$w
  #access token number
  agent.i<-agents[[k]]$i
  
  #now load tokens tables
  tokens<-book.files[grep(".tokens", book.files, fixed=TRUE)]
  tokens.df<-read.csv(file.path(wd.file, tokens), quote="", sep="\t")
  
  #sentence containing the sampled token
  sent<-tokens.df$sentence_ID[tokens.df$token_ID_within_document == agent.i]
  
  #add prior sentence
  sent<-append(sent, sent-1)
  sent.sample<-tokens.df[tokens.df$sentence_ID %in% sent,]
  
  #add to table (one row per supersense span that starts on the token)
  sample.rows[[i]]<-data.frame(
    file=filenames[i],
    type=super.df$supersense_category[super.df$start_token == agent.i],
    token=agent.w,
    text=paste(sent.sample$word, collapse = " "))
}

#bind all books (skipped books left NULL entries, which are removed first)
all.df<-do.call(rbind, Filter(Negate(is.null), sample.rows))

#########  Inter-Annotator Agreement ##########
library(irr)

#Ingest validation table
#columns 6:8 are read as the three annotators' labels and column 10 as the
#expert resolution (presumably a4 -- confirm against the table header)
setwd("")
a.df<-read.csv("Validation_Agreement_Table.csv")

#calculate agreement
#all annotators
kappam.fleiss(a.df[,6:8])
#pairwise
kappa2(a.df[,6:7])
kappa2(a.df[,c(6,8)])
kappa2(a.df[,c(7,8)])

#how many expert resolutions went against the majority? Answer = 31 out of 87
count<-0
for (i in seq_len(nrow(a.df))){
  #the annotators' labels for this row (missing labels are ignored)
  votes<-as.character(unlist(a.df[i,6:8]))
  votes<-votes[!is.na(votes)]
  expert<-as.character(a.df[i,10])
  #rows without an expert resolution cannot be scored
  if (is.na(expert)){
    next
  }
  #if there are more than one answer
  if (length(unique(votes)) > 1){
    #count the row when fewer than 2 annotators chose the expert's label.
    #Counting matches directly also covers an expert label that no annotator
    #chose (0 votes), which previously raised a zero-length `if` error.
    if (sum(votes == expert) < 2){
      count<-count+1
    }
  }
}

#### Calculate Accuracy ####

#this code was used to tag the annotations as TPs, FPs, TNs, FNs
#the tags are already included in the agreement table

# #first tag each annotation for accuracy wrt to category
# #cognition
# a.df$cognition<-vector(mode="character", length=nrow(a.df))
# for (i in 1:nrow(a.df)){
#   #if booknlp = cognition & a4 = cognition(tp)
#   if (a.df$type[i] %in% c("verb.cognition", "verb.emotion") & a.df$a4[i] == c("c")){
#     a.df$cognition[i]<-c("tp")
#   #if booknlp = cognition and a4 != cognition (fp)
#   } else if (a.df$type[i] %in% c("verb.cognition", "verb.emotion") & a.df$a4[i] != c("c")){
#     a.df$cognition[i]<-c("fp")
#   #if booknlp != cognition and a4 = cognition (fn)
#   } else if (!a.df$type[i] %in% c("verb.cognition", "verb.emotion") & a.df$a4[i] == c("c")){
#     a.df$cognition[i]<-c("fn")
#   #if booknlp != cognition and a4 != cognition (tn)
#   } else {
#     a.df$cognition[i]<-c("tn")
#   }
# }
# 
# #motion
# a.df$motion<-vector(mode="character", length=nrow(a.df))
# for (i in 1:nrow(a.df)){
#   #if booknlp = cognition & a4 = cognition(tp)
#   if (a.df$type[i] %in% c("verb.body", "verb.contact", "verb.motion") & a.df$a4[i] == c("m")){
#     a.df$motion[i]<-c("tp")
#     #if booknlp = cognition and a4 != cognition (fp)
#   } else if (a.df$type[i] %in% c("verb.body", "verb.contact", "verb.motion") & a.df$a4[i] != c("m")){
#     a.df$motion[i]<-c("fp")
#     #if booknlp != cognition and a4 = cognition (fn)
#   } else if (!a.df$type[i] %in% c("verb.body", "verb.contact", "verb.motion") & a.df$a4[i] == c("m")){
#     a.df$motion[i]<-c("fn")
#     #if booknlp != cognition and a4 != cognition (tn)
#   } else {
#     a.df$motion[i]<-c("tn")
#   }
# }

#Accuracy Functions
#precision = tp / (tp + fp)
#x: vector of outcome tags ("tp", "fp", "fn", "tn")
precision.f<-function(x){
  tp<-sum(x=="tp")
  fp<-sum(x=="fp")
  tp/(tp+fp)
}
#recall = tp / (tp + fn)
#x: vector of outcome tags ("tp", "fp", "fn", "tn")
recall.f<-function(x){
  tp<-sum(x=="tp")
  fn<-sum(x=="fn")
  tp/(tp+fn)
}
#F1 = harmonic mean of precision and recall, 2 * (p * r) / (p + r)
#x: vector of outcome tags ("tp", "fp", "fn", "tn")
f1.f<-function(x){
  numerator<-precision.f(x)*recall.f(x)
  denominator<-precision.f(x)+recall.f(x)
  2*(numerator/denominator)
}

#precision, recall and F1 for the cognition tags
#(a.df$cognition and a.df$motion hold the tp/fp/fn/tn tags, which are read
#from the agreement table)
precision.f(a.df$cognition)
recall.f(a.df$cognition)
f1.f(a.df$cognition)

#precision, recall and F1 for the motion tags
precision.f(a.df$motion)
recall.f(a.df$motion)
f1.f(a.df$motion)

#review errors
#False negatives
#NOTE: the two assignments below overwrite each other, so when the script is
#run straight through `test` holds the cognition false negatives; run one
#line at a time to inspect the motion ones
test<-a.df[a.df$motion == "fn",]
test<-a.df[a.df$cognition == "fn",]
#supersense types among the selected rows
table(test$type)

#False positives
#NOTE: same pattern as above -- `test` ends up as the cognition false positives
test<-a.df[a.df$motion == "fp",]
test<-a.df[a.df$cognition == "fp",]

########  Test Associations between Supersense and POV ########
#Regresses per-book embodiment and cognition rates on the Probability1P
#indicator, controlling for genre
setwd("")
super<-read.csv("CONLIT_SUPERSENSE.csv")
meta<-read.csv("CONLIT_META.csv")

#put both tables in the same book order
super<-super[order(super$file_name),]
meta<-meta[order(meta$ID),]
#prints positions where the two tables disagree
which(super$file_name != meta$ID)

#columns 2:42 are divided by each book's token count
#(presumably these are the supersense count columns -- confirm against the file)
super<-super[,2:42]/meta$token_count

#combined rates
meta$body<-with(super, verb.body+verb.contact+verb.motion)
meta$cog<-with(super, verb.cognition+verb.emotion)

#keep books whose Probability1P is exactly 0 or 1 and treat it as a factor
meta.sub<-meta[meta$Probability1P %in% c(1,0),]
meta.sub$Probability1P<-factor(meta.sub$Probability1P)

#linear models
body.fit<-lm(body ~ Probability1P+Genre, data=meta.sub)
summary(body.fit)
cog.fit<-lm(cog ~ Probability1P+Genre, data=meta.sub)
summary(cog.fit)


